{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notebook written by [Zhedong Zheng](https://github.com/zhedongzheng)\n",
    "\n",
    "<img src=\"img/lda.png\" width=\"600\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "brew install apache-spark\n",
    "pip3 install findspark\n",
    "\"\"\"\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "import findspark\n",
    "findspark.init()\n",
    "from pyspark import SparkContext\n",
    "from pyspark.sql import SparkSession, Row\n",
    "from pyspark.sql.types import StringType, ArrayType\n",
    "from pyspark.ml.feature import CountVectorizer, IDF\n",
    "from pyspark.ml.clustering import LDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "N_TOPICS = 20\n",
    "MAX_TERMS = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic 1: bioinformatics astronomy science microbiology statistical\n",
      "Topic 2: engineering statistics thermodynamics online student\n",
      "Topic 3: evolution cultural actuarial norton systems\n",
      "Topic 4: buddhism aerodynamics relativity database ethics\n",
      "Topic 5: accounting sociology religions social modern\n",
      "Topic 6: east feminism asian readings physical\n",
      "Topic 7: anatomy perspective basic connect islam\n",
      "Topic 8: anthology exploring earth insurance literature\n",
      "Topic 9: design forensic nutrition mechanics microprocessor\n",
      "Topic 10: general handbook philosophy medicine first\n",
      "Topic 11: real analysis finance corporate pathophysiology\n",
      "Topic 12: computer world organization greek geography\n",
      "Topic 13: american history political latin science\n",
      "Topic 14: biology machine organic research chemistry\n",
      "Topic 15: language natural international processing systems\n",
      "Topic 16: practice security theory sciences nursing\n",
      "Topic 17: data integrated early design structures\n",
      "Topic 18: quantum mechanics business solutions communication\n",
      "Topic 19: applied differential equations problems management\n",
      "Topic 20: manual clinical laboratory immunology understanding\n"
     ]
    }
   ],
   "source": [
    "stopwords = set(stopwords.words('english')).union({\n",
    "    'introduction', 'edition', 'series', 'application',\n",
    "    'approach', 'card', 'access', 'package', 'plus', 'etext',\n",
    "    'brief', 'vol', 'fundamental', 'guide', 'essential', 'printed',\n",
    "    'third', 'second', 'fourth'})\n",
    "\n",
    "sc = SparkContext('local', 'nlp')\n",
    "lines = sc.textFile('data/all_book_titles.txt')\n",
    "lines = lines \\\n",
    "    .map(lambda line: line.strip().lower()) \\\n",
    "    .map(lambda line: line.split()) \\\n",
    "    .map(lambda words: [w for w in words if w.isalpha()]) \\\n",
    "    .map(lambda words: [w for w in words if len(w) > 3]) \\\n",
    "    .map(lambda words: [w for w in words if w not in stopwords]) \\\n",
    "    .zipWithIndex()\n",
    "\n",
    "sess = SparkSession.builder.appName('nlp').getOrCreate()\n",
    "df = sess.createDataFrame(lines, ['words', 'idx'])\n",
    "\n",
    "cv = CountVectorizer(inputCol='words',\n",
    "                     outputCol='tf')\n",
    "cv = cv.fit(df)\n",
    "df = cv.transform(df)\n",
    "df = IDF(inputCol='tf',\n",
    "         outputCol='tfidf').fit(df).transform(df)\n",
    "\n",
    "lda = LDA(k=N_TOPICS,\n",
    "          featuresCol='tfidf',\n",
    "          optimizer='em').fit(df)\n",
    "\n",
    "for i, indices in enumerate(lda.describeTopics(MAX_TERMS).toPandas().termIndices):\n",
    "    print('Topic %d:'%(i+1), ' '.join([cv.vocabulary[idx] for idx in indices]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
